{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import boto3\n",
    "import sagemaker\n",
    "\n",
    "# SageMaker session, plus the execution role and default bucket tied to this notebook.\n",
    "sagemaker_session = sagemaker.Session()\n",
    "role = sagemaker.get_execution_role()\n",
    "bucket = sagemaker_session.default_bucket()\n",
    "\n",
    "# A single boto3 session supplies both the region name and the Glue client.\n",
    "boto_session = boto3.Session()\n",
    "region = boto_session.region_name\n",
    "glue = boto_session.client('glue', region_name=region)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
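    "# Update the trust policy of the notebook's execution role\n",
    "\n",
    "The crawler below runs as the notebook's IAM execution role, so that role's trust policy must include the following statement, which allows AWS Glue to assume it:\n",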
    "\n",
    "```\n",
    "    {\n",
    "      \"Sid\": \"\",\n",
    "      \"Effect\": \"Allow\",\n",
    "      \"Principal\": {\n",
    "        \"Service\": \"glue.amazonaws.com\"\n",
    "      },\n",
    "      \"Action\": \"sts:AssumeRole\"\n",
    "    }\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Crawler definition, shared by the create and update calls below.\n",
    "crawler_config = dict(\n",
    "    Name='amazon_reviews_crawler',\n",
    "    Role=role,  # the notebook's execution role; Glue must be allowed to assume it\n",
    "    DatabaseName='dsoaws',\n",
    "    Description='Amazon Customer Reviews Dataset Crawler',\n",
    "    Targets={\n",
    "        # Crawl a table that already exists in the Glue Data Catalog.\n",
    "        'CatalogTargets': [\n",
    "            {\n",
    "                'DatabaseName': 'dsoaws',\n",
    "                'Tables': [\n",
    "                    'amazon_reviews_tsv',\n",
    "                ]\n",
    "            }\n",
    "        ]\n",
    "    },\n",
    "    Schedule='cron(59 23 * * ? *)',  # run every night at 23:59 UTC\n",
    "    SchemaChangePolicy={\n",
    "        'DeleteBehavior': 'LOG'  # other values: 'DELETE_FROM_DATABASE' | 'DEPRECATE_IN_DATABASE'\n",
    "    },\n",
    "    RecrawlPolicy={\n",
    "        'RecrawlBehavior': 'CRAWL_EVERYTHING'  # 'CRAWL_NEW_FOLDERS_ONLY' for S3 Targets\n",
    "    }\n",
    ")\n",
    "\n",
    "# create_crawler raises AlreadyExistsException once the crawler exists, which\n",
    "# breaks re-running this cell (e.g. Restart & Run All). In that case, push the\n",
    "# same definition through update_crawler so the cell stays idempotent.\n",
    "try:\n",
    "    create_response = glue.create_crawler(**crawler_config)\n",
    "except glue.exceptions.AlreadyExistsException:\n",
    "    create_response = glue.update_crawler(**crawler_config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'ResponseMetadata': {'HTTPHeaders': {'connection': 'keep-alive',\n",
      "                                      'content-length': '2',\n",
      "                                      'content-type': 'application/x-amz-json-1.1',\n",
      "                                      'date': 'Sat, 26 Dec 2020 22:50:31 GMT',\n",
      "                                      'x-amzn-requestid': '7cb34ad1-6074-4c4e-92f8-74456b68a99e'},\n",
      "                      'HTTPStatusCode': 200,\n",
      "                      'RequestId': '7cb34ad1-6074-4c4e-92f8-74456b68a99e',\n",
      "                      'RetryAttempts': 0}}\n"
     ]
    }
   ],
   "source": [
    "from pprint import pprint\n",
    "\n",
    "# Pretty-print the Glue API response stored in create_response.\n",
    "pprint(create_response)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TODO: Copy a `.tsv.gz` file from `s3://amazon-reviews-pds/tsv/` to `s3://{bucket}/amazon-reviews-pds/tsv/`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TODO: Start the crawler and wait for it to finish"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TODO: Query Athena and verify that the new data shows up\n",
    "\n",
    "Note: Make sure this also works for newly added partitions of data. We may need to run `MSCK REPAIR TABLE` to pick up the new partitions if the crawler is not already registering them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
